# -*- coding: utf-8 -*-

__author__ = 'chen'

import urllib.request
from bs4 import BeautifulSoup

# Download a fixed range of listing pages from kaijiang.zhcw.com and write one
# text line per table row to OUTPUT_PATH. Each line is the text of every <em>
# element found in the row's third <td>, each followed by ';'.

# Listing pages are addressed by a 1-based page number in the URL.
PAGE_URL_TEMPLATE = 'http://kaijiang.zhcw.com/zhcw/html/ssq/list_{}.html'
FIRST_PAGE = 1
LAST_PAGE = 102  # inclusive
OUTPUT_PATH = '../data/aprioriData/ball.txt'

# The first rows of each table are skipped (presumably header rows; confirm
# against the live page markup).
SKIPPED_LEADING_ROWS = 2
# 1-based position of the <td> whose <em> elements are extracted.
TARGET_CELL_POSITION = 3


def _extract_row_lines(html):
    """Return the output lines (without trailing newline) for one page.

    Looks at the first ``<table>`` in *html*, skips its first
    ``SKIPPED_LEADING_ROWS`` ``<tr>`` elements, and for every remaining row
    that has at least ``TARGET_CELL_POSITION`` ``<td>`` elements builds a
    string from the ``<em>`` texts of that cell (newlines removed, each text
    followed by ``';'``). Rows with fewer cells produce no line; a target
    cell without ``<em>`` elements produces an empty string. A page without
    any ``<table>`` yields an empty list.
    """
    # Name the parser explicitly so the result does not depend on which
    # optional parsers happen to be installed.
    soup = BeautifulSoup(html, 'html.parser')
    table = soup.find('table')
    if table is None:
        return []

    lines = []
    # Walk the already-parsed tree instead of re-serialising and re-parsing
    # every table, row and cell.
    for row in table.find_all('tr')[SKIPPED_LEADING_ROWS:]:
        cells = row.find_all('td')
        if len(cells) < TARGET_CELL_POSITION:
            continue
        target_cell = cells[TARGET_CELL_POSITION - 1]
        lines.append(''.join(
            em.text.replace('\n', '') + ';'
            for em in target_cell.find_all('em')
        ))
    return lines


# The context managers guarantee the output file and every HTTP response are
# closed (and the file flushed) even if a request or parse step raises.
with open(OUTPUT_PATH, 'w+', encoding='utf-8') as file:
    for i in range(FIRST_PAGE, LAST_PAGE + 1):
        url = PAGE_URL_TEMPLATE.format(i)
        with urllib.request.urlopen(url) as response:
            html = response.read().decode('utf-8')
        # Progress output, one line per downloaded page.
        print(i, '第',i,'页')
        file.writelines(line + '\n' for line in _extract_row_lines(html))
